Introduction

This script creates a correlation matrix between all variables of our data set, using Cramers V and the the sample we will also use for model specification. Further, we will create groups for highly correlated Variables and select one to represent this group.

remove(list = ls())
library(reactable)
library(sf)
library(data.table)
library(tidyverse)
library(corrplot)
library(reshape2)
library(igraph)
sample <- readRDS("../data_raw/samples/sample_for_model_building.RDS")

Cramers V

library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
sample$aestudio %>% is.na() %>% table()
## .
## FALSE 
##  2260
sample <- sample %>% as.data.frame()


### filter with factors over 15
idx <- sapply(sample,FUN = function(x) length(unique(x)) > 15)
idx %>% table()
## .
## FALSE  TRUE 
##   128    36
sample2 <- sample[, !idx]

### filter with na percentage higher then .5
idx <- sapply(sample2,FUN = function(x) mean(is.na(x)) > .5)
idx %>% table()
## .
## FALSE  TRUE 
##    95    33
sample2 <-  sample2[, !idx]

### other non informative variables filter 
sample2 <- sample2[,!colnames(sample2) %in% c("idep","imun")]

sample_fast <- data.frame(lapply(sample2, function(x) {
  if (is.factor(x)) as.numeric(x) else x
}))

sample2
library(vcd)
## Loading required package: grid
tmp_data <- matrix(ncol = 3,nrow = 0) %>% as.data.frame()
colnames(tmp_data) <- c("v1","v2","cramresV")

for(i in seq_along(sample2)){
  for(j in seq_along(sample2)){
  x1 <- sample2[,i] %>% as.factor()
  x2 <- sample2[,j] %>% as.factor()
  tbl <- table(x1, x2)
  tmp_vec <- c(colnames(sample2)[i],colnames(sample2)[j],assocstats(tbl)$cramer[[1]]) %>% unlist()
  tmp_data <- rbind(tmp_data,tmp_vec)
    
  }
}

colnames(tmp_data) <- c("v1","v2","cramresV")

hist

tmp_data$cramresV <- tmp_data$cramresV %>% as.numeric
tmp_data$cramresV %>% hist()

This is a histogramm showing the distribution of the correlation between all variables.

corr matrix

cram_mat <- tmp_data %>%
  filter(v1 != v2) %>%                  # remove self-pairs
  pivot_wider(
    names_from  = v2,
    values_from = cramresV
  ) %>%
  column_to_rownames("v1") %>%
  as.matrix()

dim(cram_mat)
## [1] 93 93
corrplot(
  cram_mat,
  method = "color",
  type   = "lower",
  tl.cex = 0.6
)

cram_long <- melt(cram_mat)

corr_matrix_2 <-  ggplot(cram_long, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  scale_fill_viridis_c() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 6),
    axis.text.y = element_text(size = 6)
  ) +
  labs(fill = "Cramér's V")

corr_matrix_2

ggsave("../output/corr_matrix-2.png",plot = corr_matrix_2,width = 10,height = 6, device = "png",dpi = 300)
dat_selection <- tmp_data

### drop the pair with itself
dat_selection <- dat_selection[dat_selection$v1 != dat_selection$v2,]
dat_selection <- dat_selection[!is.na(dat_selection$cramresV),]
dat_selection$cramresV %>% hist()

dat_selection %>% reactable(.,compact = T,filterable = T)
### load in list from. A
list_variables <- readRDS("../data_raw/misc/list_variables_seleciton.RDS")

### filter for only these variables
dat_selection <- dat_selection[dat_selection$v1 %in% list_variables,]

dat_selection$pair_sorted <- apply(dat_selection[, c("v1","v2")], 1, function(x) paste(sort(x), collapse = "_"))
dat_selection_unique <- dat_selection[!duplicated(dat_selection$pair_sorted), ]

dat_selection_unique[,1:3] %>% reactable(.,compact = T,filterable = T)
# dat_selection_unique: v1, v2, cramresV
threshold <- 0.75

# keep only strong pairs
strong_pairs <- dat_selection_unique %>% filter(cramresV > threshold)

# build graph
g <- graph_from_data_frame(strong_pairs[, c("v1", "v2")], directed = FALSE)

# find connected components
comp <- components(g)

# create a dataframe with groups
group_df <- data.frame(
  var = names(comp$membership),
  group = comp$membership
)

# optional: collapse variables in each group into one row
grouped_vars <- group_df %>%
  group_by(group) %>%
  summarise(vars = paste(var, collapse = ", "), .groups = "drop")

reactable(grouped_vars)